In this notebook, created as a project for the Udacity Data Science Nanodegree course, we build a number of different methods for making recommendations, each suited to a different situation.
I. Exploratory Data Analysis
II. Rank Based Recommendations
III. User-User Based Collaborative Filtering
IV. Matrix Factorization
Let's get started by importing the necessary libraries and reading in the data.
#
# import libraries
#
import pandas as pd
import numpy as np
import project_tests as t
import pickle
from subprocess import call
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
%matplotlib inline
# Record the interpreter and pandas versions used for this analysis,
# so the environment can be reproduced later.
import sys

for library, version in (("Python", sys.version), ("Pandas", pd.__version__)):
    print("{}: {}".format(library, version))
# Load the interaction and article-content datasets, dropping the stray
# "Unnamed: 0" column that pandas writes when a CSV is saved with its index.
try:
    df = pd.read_csv('data/user-item-interactions.csv')
    df_content = pd.read_csv('data/articles_community.csv')
    del df['Unnamed: 0']
    del df_content['Unnamed: 0']
except FileNotFoundError:
    file1 = 'user-item-interactions.csv'
    file2 = 'articles_community.csv'
    print("The csv files {}, {} don't exist in the given directory. No analysis possible.".format(file1, file2))
else:
    # success — report the shapes only when loading worked. Previously these
    # prints ran unconditionally, so a FileNotFoundError was followed by a
    # NameError because df / df_content were never defined.
    print("The user-item dataset has {} data points with {} variables each.".format(*df.shape))
    print("The articles-community dataset has {} data points with {} variables each.".format(*df_content.shape))
# Preview the first rows of the user-item interaction data (df) to get an
# idea of its structure; later cells show it holds article_id, title, email.
df.head()
# Preview the first rows of the article-content data (df_content) to get an
# idea of its structure.
df_content.head()
Now, we provide some insight into the descriptive statistics of the data.
1. What is the distribution of the number of articles a user interacts with in the dataset? Provide a visual and descriptive statistics that summarize how many times each user interacts with articles.
# Inspect the schema and null counts of the article-content dataframe,
# count its fully duplicated rows, then bring the interaction dataframe's
# article_id to the same int64 dtype so the two frames can be compared.
df_content.info()
int(df_content.duplicated().sum())
df.info()
# see: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.astype.html
# cast article_id to int64 so both dataframes use the same datatype
df['article_id'] = df['article_id'].astype('int64')
df.info()
Note: As this df information shows, there are 17 fewer email values than article_id's and titles. So, 17 of the email values are null.
# Count the exact duplicate rows in the interaction data and keep their
# index labels around for later reference.
int(df.duplicated().sum())
interaction_dup_idx = list(df.index[df.duplicated()])
# Keep the duplicates for now: dropping them would break task 3 below and
# the sol_1_dict test function. In general, though, the duplicate
# interactions should be removed:
#df.drop_duplicates(inplace=True)
#df.info()
# After dropping, 13 email values would still be missing, i.e. not every
# recorded interaction is mapped to an email:
#<class 'pandas.core.frame.DataFrame'>
#Int64Index: 33682 entries, 0 to 45992
#Data columns (total 3 columns):
# Column Non-Null Count Dtype
#--- ------ -------------- -----
# 0 article_id 33682 non-null int64
# 1 title 33682 non-null object
# 2 email 33669 non-null object
#dtypes: int64(1), object(2)
#memory usage: 1.0+ MB
# Summarize how many distinct articles each dataset contains and the range
# of their ids.
# Bug fix: the last two prints previously queried df instead of df_content,
# so the "article dataset" min/max simply repeated the interaction
# dataset's values.
print("There are {} different articles in the interaction dataset.".format(df['article_id'].nunique()))
print("In the interaction dataset the smallest article number is: {}".format(df['article_id'].min()))
print("In the interaction dataset the highest article number is: {}".format(df['article_id'].max()))
print("There are {} different articles in the articles dataset.".format(df_content['article_id'].nunique()))
print("In the article dataset the smallest article number is: {}".format(df_content['article_id'].min()))
print("In the article dataset the highest article number is: {}".format(df_content['article_id'].max()))
# Distribution of interactions per article: count the email entries
# recorded for every article_id.
series_grouped1 = df.groupby(['article_id'])['email'].count()
series_grouped1[:20]
series_grouped1.values
series_grouped1.values.max()
series_grouped1.idxmax()
# visualise the article - user email count distribution:
# build an (article_id, email_count) dataframe ordered so the most
# interacted-with articles come first.
df_grouped1_interact = pd.DataFrame(
    {
        'article_id': list(series_grouped1.index),
        'email_count': list(series_grouped1.values),
    },
    columns=['article_id', 'email_count'],
)
df_grouped1_interact = df_grouped1_interact.nlargest(len(series_grouped1), 'email_count')
df_grouped1_interact.head(10)
# Horizontal bar chart; the tall figure (width, height in inches) keeps
# every article row legible.
df_grouped1_interact.plot.barh(x='article_id', y='email_count', figsize=[10, 120])
plt.title("Article User Interaction Distribution")
plt.ylabel('article id')
plt.xlabel('amount of emails for each article')
plt.show()
# How often does each (article, email) pair occur? This reveals repeat
# interactions of the same user with the same article.
series_grouped2 = df.groupby(['article_id', 'email'])['email'].count()
series_grouped2[:20]
Note: As we have investigated, for a given article the same email can be linked to it several times, not just once.
# Distribution of interactions per user: count the article interactions
# recorded for every email.
series_grouped3 = df.groupby(['email'])['article_id'].count()
series_grouped3
# Spot-check one user's interactions.
df.query("email == '0000b6387a0366322d7fbfc6434af145adf7fed1'")
# visualise the user email - article count distribution
index_list = list(range(0, len(series_grouped3)))
interaction_dict = {
    'email': list(series_grouped3.index),
    'article_id_count': list(series_grouped3.values),
}
df_grouped_interact = pd.DataFrame(data=interaction_dict,
                                   columns=['email', 'article_id_count'], index=index_list)
df_grouped_interact = df_grouped_interact.nlargest(len(series_grouped3), 'article_id_count')
df_grouped_interact.head(40)
# Figure dimension (width, height) in inches
df_grouped_interact.plot.barh(x='email', y='article_id_count', figsize=[10, 900])
plt.title("User Article Interaction Distribution")
plt.ylabel('email label')
# Fix: add the missing x-axis label, matching the article-distribution
# plot above which labels both axes.
plt.xlabel('amount of articles for each user email')
plt.show()